원작자 : Maksym Shkliarevskyi
https://www.researchgate.net/profile/Maksym-Shkliarevskyi-2?fbclid=IwAR3bHR2k--e5yZOekorZFYQikVkzGf8Z4hcSY1z4TC0QUKtd5yePUxAQo34

1 Kaggle Survey 2017-2021

Kaggle은 매년 이용자들을 대상으로 Survey를 수행하여 그 결과를 공개한다. 2021년에는 2021.9.1부터 2021.10.4까지 설문을 진행하였고 총 25,973개의 답변을 공개하였다.

단답형 문항은 문항당 하나의 컬럼으로, 복수 선택 문항은 선택지마다 하나의 컬럼(예: Q7_Part_1, …, Q7_OTHER)으로 구성되어 있음

2 필요한 라이브러리 로드 및 커스텀 테마, 색상 지정

library(tidyverse)
library(patchwork) #ggplot 그래프 이어붙이기 옆으로 이어붙일때는 + 아래로 이어붙일때는 슬래쉬 (/)
library(scales)
library(ggpubr) #geom braket
library(viridis) #color map

# Custom theme
# theme_minimal() becomes the session-wide default; `my_theme` holds the shared
# title/axis text styling that every figure below adds with `+ my_theme`.
theme_set(theme_minimal())
my_theme <- theme(plot.title = element_text(hjust = 0.5, face = 'bold', size = 18),
        plot.subtitle = element_text(hjust = 0.5, size = 13),
        axis.title = element_text(face = 'bold', size = 15),
        axis.text = element_text(size = 13))

# Custom palette
# Five colours; the plots below use index 5 for 2021 down to index 1 for 2017.
my_palette <- c('#FFCB3E', '#FB836F', '#C1549C', '#7E549F', '#33546D')
# Preview the swatches. Spelled TRUE rather than T, because T is an ordinary
# variable that can be reassigned.
show_col(my_palette, ncol = 1, labels = TRUE)

3 데이터로드

3.1 2021 데이터

# Read the 2021 responses, then split the table: row 1 is kept aside as the
# survey-question (description) row, everything after it is the answers.
ks_2021 <- read_csv('/Users/spark/OneDrive/ps/Kaggle/Survey/2021/kaggle_survey_2021_responses.csv')
ks_2021_descriptions <- ks_2021[1, ] # question/description row, kept separately
# Negative indexing drops row 1 safely; `2:nrow(x)` would count backwards
# (2, 1) if the file ever contained only that single row.
ks_2021 <- ks_2021[-1, ] # all remaining rows are responses

3.2 2017-2020 데이터

# Same split as for 2021, repeated for each earlier survey year: row 1 goes to
# `*_descriptions`, the remaining rows stay as the responses.
# Row 1 is removed with a negative index because `2:nrow(x)` counts backwards
# (2, 1) when a file holds a single row.
ks_2020 <- read_csv('/Users/spark/OneDrive/ps/Kaggle/Survey/2020/kaggle_survey_2020_responses.csv')
ks_2020_descriptions <- ks_2020[1, ]
ks_2020 <- ks_2020[-1, ]

ks_2019 <- read_csv('/Users/spark/OneDrive/ps/Kaggle/Survey/2019/multiple_choice_responses.csv')
ks_2019_descriptions <- ks_2019[1, ]
ks_2019 <- ks_2019[-1, ]

ks_2018 <- read_csv('/Users/spark/OneDrive/ps/Kaggle/Survey/2018/multipleChoiceResponses.csv')
ks_2018_descriptions <- ks_2018[1, ]
ks_2018 <- ks_2018[-1, ]

# NOTE(review): ks_2017$Age is later passed to cut(), which needs numeric
# input, so this file may not contain a question-text row at all. If so, the
# row removed here is a real respondent -- TODO confirm against the CSV.
ks_2017 <- read_csv('/Users/spark/OneDrive/ps/Kaggle/Survey/2017/multipleChoiceResponses.csv')
ks_2017_descriptions <- ks_2017[1, ]
ks_2017 <- ks_2017[-1, ]

4 Kaggle Survey의 2017~2021년 한국의 트렌드를 비교

나이, 성별, 교육수준, 재직여부, 경력 등을 연도별로 시각화하여 분석해보고자 함

4.1 연령

Q1 What is your age (# years)?

# 2021
# Age-band bar chart for South Korea respondents, with a bracket that reports
# the combined share of the 25-29 and 30-34 bands.
# The bracket label and its x-range are computed from the data instead of
# being typed in by hand, so they cannot drift from the bars they describe.
kor_age21 <- local({
  age_counts <- ks_2021 %>%
    filter(Q3 == 'South Korea') %>%
    count(Q1) %>%
    rename(Age = Q1, Count = n)

  bracket_bands <- c('25-29', '30-34')
  # Position of each band on the discrete x axis (bands are drawn in sorted order).
  bracket_pos <- match(bracket_bands, sort(unique(age_counts$Age)))
  stopifnot(!anyNA(bracket_pos))
  # Same denominator as the per-bar percentage labels.
  bracket_share <- sum(age_counts$Count[age_counts$Age %in% bracket_bands]) /
    sum(age_counts$Count)

  age_counts %>%
    ggplot(aes(Age, Count)) +
    geom_col(fill = my_palette[5], color = 'black') +
    # Share of respondents on each bar.
    geom_label(aes(label = label_percent(accuracy = 0.1)(Count / sum(Count)))) +
    # Total number of responses at a fixed spot in the panel.
    geom_label(aes(x = 9, y = 100, label = paste0('Responses: ', sum(Count))),
               col = my_palette[5], size = 6) +
    geom_bracket(xmin = min(bracket_pos) - 0.5, xmax = max(bracket_pos) + 0.5,
                 y.position = 115,
                 label = label_percent(accuracy = 0.1)(bracket_share),
                 tip.length = 0.02, size = 0.9, label.size = 6, color = my_palette[5]) +
    labs(x = '', title = 'Age distribution (South Korea)',
         subtitle = '(2021)') +
    my_theme
})

# 2020
# Age-band bar chart for the South Korea respondents of the 2020 survey.
kor_age20 <- ks_2020 %>%
  filter(Q3 == 'South Korea') %>%
  count(Age = Q1, name = 'Count') %>%
  mutate(Share = Count / sum(Count),
         Total = sum(Count)) %>%
  ggplot(mapping = aes(x = Age, y = Count)) +
  geom_col(colour = 'black', fill = my_palette[4]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  # Total number of responses at a fixed spot in the panel.
  geom_label(aes(x = 9, y = 40, label = paste0('Responses: ', Total)),
             colour = my_palette[4], size = 6) +
  labs(subtitle = '(2020)', x = '') +
  my_theme

# 2019
# Age-band bar chart for the South Korea respondents of the 2019 survey.
kor_age19 <- ks_2019 %>%
  filter(Q3 == 'South Korea') %>%
  count(Age = Q1, name = 'Count') %>%
  mutate(Share = Count / sum(Count),
         Total = sum(Count)) %>%
  ggplot(mapping = aes(x = Age, y = Count)) +
  geom_col(colour = 'black', fill = my_palette[3]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  # Total number of responses at a fixed spot in the panel.
  geom_label(aes(x = 8.5, y = 45, label = paste0('Responses: ', Total)),
             colour = my_palette[3], size = 6) +
  labs(subtitle = '(2019)', x = '') +
  my_theme

# 2018
# Age-band bar chart for the South Korea respondents of the 2018 survey.
# In this year's file the age answer is read from column Q2.
kor_age18 <- ks_2018 %>%
  filter(Q3 == 'South Korea') %>%
  count(Age = Q2, name = 'Count') %>%
  mutate(Share = Count / sum(Count),
         Total = sum(Count)) %>%
  ggplot(mapping = aes(x = Age, y = Count)) +
  geom_col(colour = 'black', fill = my_palette[2]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  # Total number of responses at a fixed spot in the panel.
  geom_label(aes(x = 8.5, y = 60, label = paste0('Responses: ', Total)),
             colour = my_palette[2], size = 6) +
  labs(subtitle = '(2018)', caption = '', x = '') +
  my_theme

# 2017
# Bin the 2017 age values into the same age bands the later surveys report.
# `right = FALSE` makes every bin closed on the left ([18, 22) -> "18-21"), so
# the labels describe the bins exactly. With cut()'s default right-closed bins
# an age of 18 was counted as "<18", 22 as "18-21", 25 as "22-24", and so on.
# `include.lowest = TRUE` keeps the top break (100) inside the last bin.
# Values outside [0, 100] and missing ages become NA.
ks_2017$Age_range <- cut(
  ks_2017$Age,
  breaks = c(0, 18, 22, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80, 100),
  labels = c("<18", "18-21", "22-24", "25-29", "30-34", "35-39", "40-44",
             "45-49", "50-54", "55-59", "60-69", "70-79", "80+"),
  right = FALSE,
  include.lowest = TRUE
)

# Age-band bar chart for the South Korea respondents of the 2017 survey.
kor_age17 <- ks_2017 %>% 
  filter(Country == 'South Korea') %>% 
  count(Age_range) %>% 
  rename(Age = Age_range, Count = n) %>% 
  # Drop only the missing-age row. The previous `slice(-n())` removed whatever
  # row came last, which is a real age band whenever no NA row is present.
  filter(!is.na(Age)) %>% 
  ggplot(aes(Age, Count)) +
  geom_col(fill = my_palette[1], color = 'black') +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Count / sum(Count)))) +
  # Total number of responses at a fixed spot in the panel.
  geom_label(aes(x = 10, y = 50, label = paste0('Responses: ', sum(Count))),
             col = my_palette[1], size = 6) +
  labs(x = '', subtitle = '(2017)',
       caption = '\u00A9 Seung Park') +
  my_theme

# Patchwork layout: the first plot (A) spans the whole top row, the remaining
# four plots (B-E) sit in two rows of two.
design <- 'AAAAAAAA\nBBBBCCCC\nDDDDEEEE'

wrap_plots(kor_age21, kor_age20, kor_age19, kor_age18, kor_age17,
           design = design)

2021년에는 359명이 응답하였고, 20대 중후반이 30%, 30대 초중반이 15%의 비율을 차지함

4.2 남녀성비

Q2 What is your gender? - Selected Choice

# 2021
# Horizontal bar chart of gender answers (Q2) for South Korea; bars are
# ordered by their count.
kor_gender21 <- ks_2021 %>%
  filter(Q3 == 'South Korea') %>%
  count(Gender = Q2, name = 'Count') %>%
  mutate(Share = Count / sum(Count)) %>%
  ggplot(mapping = aes(x = Count, y = reorder(Gender, Count))) +
  geom_col(colour = 'black', fill = my_palette[5]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  labs(title = 'Gender distribution (South Korea)',
       subtitle = '(2021)',
       y = '') +
  my_theme

# 2020
# Horizontal bar chart of gender answers (Q2) for South Korea; bars are
# ordered by their count.
kor_gender20 <- ks_2020 %>%
  filter(Q3 == 'South Korea') %>%
  count(Gender = Q2, name = 'Count') %>%
  mutate(Share = Count / sum(Count)) %>%
  ggplot(mapping = aes(x = Count, y = reorder(Gender, Count))) +
  geom_col(colour = 'black', fill = my_palette[4]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  labs(subtitle = '(2020)', y = '') +
  my_theme

# 2019
# Horizontal bar chart of gender answers (Q2) for South Korea; bars are
# ordered by their count.
kor_gender19 <- ks_2019 %>%
  filter(Q3 == 'South Korea') %>%
  count(Gender = Q2, name = 'Count') %>%
  mutate(Share = Count / sum(Count)) %>%
  ggplot(mapping = aes(x = Count, y = reorder(Gender, Count))) +
  geom_col(colour = 'black', fill = my_palette[3]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  labs(subtitle = '(2019)', y = '') +
  my_theme

# 2018
# Horizontal bar chart of gender answers for South Korea; in this year's file
# the gender answer is read from column Q1. Bars are ordered by their count.
kor_gender18 <- ks_2018 %>%
  filter(Q3 == 'South Korea') %>%
  count(Gender = Q1, name = 'Count') %>%
  mutate(Share = Count / sum(Count)) %>%
  ggplot(mapping = aes(x = Count, y = reorder(Gender, Count))) +
  geom_col(colour = 'black', fill = my_palette[2]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  labs(subtitle = '(2018)', caption = '', y = '') +
  my_theme

# 2017
# Horizontal bar chart of gender answers (GenderSelect) for South Korea.
# Missing answers are left out and the long non-binary answer text is
# shortened to 'Non-binary' before plotting.
kor_gender17 <- ks_2017 %>%
  filter(Country == 'South Korea') %>%
  count(Gender = GenderSelect, name = 'Count') %>%
  filter(!is.na(Gender)) %>%
  mutate(Gender = str_replace(Gender, 'Non-binary, genderqueer, or gender non-conforming', 'Non-binary'),
         Share = Count / sum(Count)) %>%
  ggplot(mapping = aes(x = Count, y = reorder(Gender, Count))) +
  geom_col(colour = 'black', fill = my_palette[1]) +
  # Share of respondents on each bar.
  geom_label(aes(label = label_percent(accuracy = 0.1)(Share))) +
  labs(subtitle = '(2017)',
       caption = '\u00A9 Seung Park',
       y = '') +
  my_theme

# Patchwork layout: the first plot (A) spans the whole top row, the remaining
# four plots (B-E) sit in two rows of two.
design <- 'AAAAAAAA\nBBBBCCCC\nDDDDEEEE'

wrap_plots(kor_gender21, kor_gender20, kor_gender19, kor_gender18, kor_gender17,
           design = design)

남녀의 비율은 전체적으로 8:2

4.3 연령대별 남녀비율

# Gender values kept for the heatmap; the vector order is also the factor
# level order of the Gender column.
genders <- c('Nonbinary', 'Woman', 'Man')

# Share of South Korea respondents in every gender x age-band cell (2021).
# `rate` is each cell's count divided by the total over the kept genders;
# combinations that never occur are added with rate 0.
kor_gend_age21 <- ks_2021 %>%
  filter(Q3 == 'South Korea') %>%
  count(Gender = Q2, Age = Q1, name = 'Count') %>%
  filter(Gender %in% genders) %>%
  mutate(rate = Count / sum(Count)) %>%
  complete(Gender,
           Age = c("18-21","22-24","25-29", "30-34", "35-39", "40-44",
                   "45-49", "50-54", "55-59", "60-69"),
           fill = list(rate = 0)) %>%
  mutate(Gender = factor(Gender, levels = genders))

# Heatmap of the gender x age shares: one tile per cell, coloured by `rate`
# and labelled with the percentage.
ggplot(data = mutate(kor_gend_age21,
                     Label = label_percent(accuracy = 0.01)(rate)),
       mapping = aes(x = Age, y = Gender)) +
  geom_tile(aes(fill = rate), colour = "gray30", show.legend = FALSE) +
  geom_text(aes(label = Label), size = 5) +
  scale_fill_viridis(option = 'viridis', begin = 0.4, end = 1) +
  labs(title = 'Gender by Age (South Korea)',
       subtitle = '(2021)',
       caption = '\u00A9 Seung Park') +
  my_theme +
  theme(panel.grid = element_blank())

4.4 학력

Q4 What is the highest level of formal education that you have attained or plan to attain within the next 2 years ?(2017제외)

# Builds the South Korea education-level bar chart for one survey year.
# The four yearly charts differ only in data, colour, subtitle and caption,
# so the shared pipeline lives in this one helper.
#   responses: one year's response table (uses Q3 = country, Q4 = education).
#   fill:      bar fill colour.
#   subtitle:  subtitle text, e.g. '(2021)'.
#   caption:   caption text; left unset when not supplied.
# Returns a ggplot object.
plot_kor_education <- function(responses, fill, subtitle, caption = waiver()) {
  responses %>%
    filter(Q3 == 'South Korea') %>%
    count(Q4) %>%
    # Shorten the two longest answer texts.
    mutate(Q4 = str_replace(Q4, 'No formal education past high school', 'High school')) %>%
    mutate(Q4 = str_replace(Q4, 'Some college/university study without earning a bachelor’s degree', 'Some college/university')) %>%
    rename(Education = Q4, Count = n) %>%
    # Removes the opt-out answer; rows with a missing answer are dropped too,
    # because the comparison yields NA for them.
    filter(Education != 'I prefer not to answer') %>%
    ggplot(aes(Count, reorder(Education, Count))) +
    geom_col(fill = fill, color = 'black') +
    # Share of the remaining respondents on each bar.
    geom_label(aes(label = label_percent(accuracy = 0.1)(Count / sum(Count)))) +
    labs(y = '', subtitle = subtitle, caption = caption) +
    my_theme
}

# 2021
kor_ed21 <- plot_kor_education(ks_2021, my_palette[5], '(2021)')

# 2020
kor_ed20 <- plot_kor_education(ks_2020, my_palette[4], '(2020)')

# 2019
kor_ed19 <- plot_kor_education(ks_2019, my_palette[3], '(2019)', caption = '')

# 2018
kor_ed18 <- plot_kor_education(ks_2018, my_palette[2], '(2018)',
                               caption = '\u00A9 Seung Park')

# Patchwork layout: a 2 x 2 grid (A B on top, C D below) with a shared title.
design <- 'AAAABBBB\nCCCCDDDD'

wrap_plots(kor_ed21, kor_ed20, kor_ed19, kor_ed18, design = design) +
  plot_annotation(title = "Education level distribution (South Korea)",
                  theme = my_theme)

2021년에는 학사(36.6%)가 석사(26.5%)보다 높았음. 2020년에는 학사 26.7%, 석사 32.6%였음

4.5 학생 또는 재직여부

Q5 Select the title most similar to your current role (or most recent title if retired)

# Counts the South Korea respondents per role for one survey year.
# The five yearly tables differ only in the source table and in which columns
# hold the country and the role, so the shared pipeline lives in this helper.
#   responses:   one year's response table.
#   country_col: unquoted name of the country column.
#   role_col:    unquoted name of the job-title column.
# Returns a tibble with columns Role, Count and Rate (Count / total Count),
# sorted by descending Count.
kor_role_share <- function(responses, country_col, role_col) {
  responses %>%
    filter({{ country_col }} == 'South Korea') %>%
    count(Role = {{ role_col }}, name = 'Count') %>%
    arrange(desc(Count)) %>%
    mutate(Rate = Count / sum(Count))
}

kor_role21 <- kor_role_share(ks_2021, Q3, Q5)

kor_role20 <- kor_role_share(ks_2020, Q3, Q5)

kor_role19 <- kor_role_share(ks_2019, Q3, Q5)

# The 2018 file keeps the role answer in Q6.
kor_role18 <- kor_role_share(ks_2018, Q3, Q6)

# The 2017 file uses descriptive column names instead of question ids.
kor_role17 <- kor_role_share(ks_2017, Country, CurrentJobTitleSelect)


# One row per 2021 role with that role's share in every survey year.
# Left joins keep exactly the roles present in 2021; a role that is absent
# from an earlier year gets NA in that year's column.
kor_roles <- kor_role21 %>%
  select(Role, `2021` = Rate) %>%
  left_join(select(kor_role20, Role, `2020` = Rate), by = 'Role') %>%
  left_join(select(kor_role19, Role, `2019` = Rate), by = 'Role') %>%
  left_join(select(kor_role18, Role, `2018` = Rate), by = 'Role') %>%
  left_join(select(kor_role17, Role, `2017` = Rate), by = 'Role')


# Horizontal bar chart of the 2021 roles, ordered by count and labelled with
# each role's share.
kor_r21 <- ggplot(data = kor_role21,
                  mapping = aes(x = Count, y = reorder(Role, Count))) +
  geom_col(colour = 'black', fill = my_palette[5]) +
  geom_label(aes(label = label_percent(accuracy = 0.01)(Rate))) +
  labs(title = 'Current position (South Korea)',
       subtitle = '(2021)',
       y = 'Position') +
  my_theme


# Lollipop chart, one facet per earlier year: for every role, that year's
# share minus the 2021 share.
kor_r_20_17 <- kor_roles %>%
  # Replace each earlier year's share by its difference from the 2021 share.
  mutate(across(c(`2020`, `2019`, `2018`, `2017`), ~ .x - `2021`)) %>%
  select(Role, `2020`:`2017`) %>%
  gather(key = Year, value = Value, `2020`:`2017`) %>%
  # hjust pushes the label to the right of positive points and to the left of
  # the others.
  mutate(hjust = if_else(Value > 0, -0.5, 1.5),
         Label = label_percent(accuracy = 0.01)(Value)) %>%
  ggplot(mapping = aes(x = Value, y = Role, colour = Value > 0)) +
  geom_vline(xintercept = 0, colour = 'gray50') + # vertical reference line at zero
  geom_point(show.legend = FALSE, size = 7) +
  geom_segment(aes(x = 0, xend = Value, y = Role, yend = Role),
               show.legend = FALSE, size = 1.2) +
  geom_label(aes(label = Label, hjust = hjust),
             show.legend = FALSE, size = 4) +
  scale_x_continuous(limits = c(-0.1, 0.1),
                     breaks = c(-0.08, -0.04, 0, 0.04, 0.08),
                     labels = label_percent(accuracy = 1)(c(-0.08, -0.04, 0, 0.04, 0.08))) +
  # First colour is used for Value > 0 == FALSE, second for TRUE.
  scale_color_manual(values = c('#E61C5D', '#0A516D')) +
  facet_wrap(~Year, nrow = 2, as.table = FALSE) +
  labs(title = 'Current positions by past years (South Korea)',
       subtitle = '(differences in comparison with 2021)',
       x = 'Difference, %',
       y = 'Position',
       caption = '\u00A9 Seung Park') +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid = element_line(linetype = 'dashed', size = 0.4),
        strip.text = element_text(size = 13, face = 'bold'),
        strip.background = element_rect(color = 'black', fill = 'gray95'),
        strip.switch.pad.wrap = unit(500, 'mm')) +
  my_theme

# Patchwork layout: the 2021 chart (A) takes the top third, the year-by-year
# comparison (B) the lower two thirds.
design <- 'AAAAAA\nBBBBBB\nBBBBBB'

wrap_plots(kor_r21, kor_r_20_17, design = design)

2021년 응답자 중 27.3%가 학사 또는 석사 학생이었으며, 2020년에는 25%가 학생이었음

4.5.1 학력수준에 따른 현재 직업

# Education levels in display order; also the factor level order of the
# Education column.
levels <- c("High school", "Some college/university", "Bachelor’s degree", 
            "Master’s degree", "Professional doctorate", "Doctoral degree")

# Share of South Korea respondents in every position x education cell (2021).
# `rate` is each cell's count over the total of the kept rows; combinations
# that never occur are added with rate 0 (their Count stays NA).
kor_exp_ed21 <- ks_2021 %>%
  filter(Q3 == 'South Korea') %>%
  count(Position = Q5, Education = Q4, name = 'Count') %>%
  # Shorten the two longest answer texts so they match `levels`.
  mutate(Education = str_replace(Education, 'No formal education past high school', 'High school'),
         Education = str_replace(Education, 'Some college/university study without earning a bachelor’s degree', 'Some college/university')) %>%
  # Removes the opt-out answer; rows with a missing answer are dropped too.
  filter(Education != 'I prefer not to answer') %>%
  mutate(rate = Count / sum(Count)) %>%
  complete(Position, Education = levels, fill = list(rate = 0)) %>%
  mutate(Education = factor(Education, levels = levels))

# Heatmap of position x education shares: one tile per cell, coloured by
# `rate` and labelled with the percentage.
# Rows are ordered by each position's total count. The cells that complete()
# added carry an NA Count, so the default reorder() summary (mean) returned NA
# for almost every position and the ordering was lost; summing with
# na.rm = TRUE ranks every position by its real total.
kor_exp_ed21 %>% 
  ggplot(aes(Education, reorder(Position, Count, FUN = sum, na.rm = TRUE))) +
  geom_tile(aes(fill = rate), show.legend = FALSE, color = "gray30") +
  geom_text(aes(label = label_percent(accuracy = 0.01)(rate)), size = 5) +
  scale_fill_viridis(option = 'viridis', begin = 0.4, end = 1) +
  labs(y = 'Position',
       title = 'Position by Education level (South Korea)',
       subtitle = '(2021)',
       caption = '\u00A9 Seung Park') +
  my_theme +
  theme(panel.grid = element_blank())

4.5.2 직업별 경력

Q6 For how many years have you been writing code and/or programming?

# Coding-experience groups in display order; also the factor level order of
# the Experience column.
exp_lev <- c('I have never written code', '< 1', '1-3', '3-5', '5-10', '10-20', '20+')

# Share of South Korea respondents in every position x experience cell (2021).
# The ' years' suffix is stripped from the Q6 answers so they match `exp_lev`;
# combinations that never occur are added with rate 0 (their Count stays NA).
kor_exp21 <- ks_2021 %>%
  filter(Q3 == 'South Korea') %>%
  count(Position = Q5,
        Experience = str_replace(Q6, ' years', ''),
        name = 'Count') %>%
  mutate(rate = Count / sum(Count)) %>%
  complete(Position, Experience = exp_lev, fill = list(rate = 0)) %>%
  mutate(Experience = factor(Experience, levels = exp_lev))

# Heatmap of position x experience shares: one tile per cell, coloured by
# `rate` and labelled with the percentage.
# Rows are ordered by each position's total count. The cells that complete()
# added carry an NA Count, so the default reorder() summary (mean) returned NA
# for almost every position and the ordering was lost; summing with
# na.rm = TRUE ranks every position by its real total.
kor_exp21 %>% 
  ggplot(aes(Experience, reorder(Position, Count, FUN = sum, na.rm = TRUE))) +
  geom_tile(aes(fill = rate), show.legend = FALSE, color = "gray30") +
  geom_text(aes(label = label_percent(accuracy = 0.01)(rate)), size = 5) +
  scale_fill_viridis(option = 'viridis', begin = 0.4, end = 1) +
  labs(y = 'Position',
       title = 'Position by Experience groups',
       subtitle = '(2021)',
       caption = '\u00A9 Seung Park') +
  my_theme +
  theme(panel.grid = element_blank())

4.6 사용하는 언어

Q7 What programming languages do you use on a regular basis?(중복답변허용)

# Long-format language tables, one per survey year: every selected language
# of every South Korea respondent becomes one row (`key` = source column,
# `Language` = answer text); unselected options (NA) are removed.
# `id` numbers the respondents before reshaping. row_number() is used instead
# of `1:n()`, which yields c(1, 0) and fails when the filter leaves no rows.
kor_lang2021 <- ks_2021 %>% 
  filter(Q3 == 'South Korea') %>%
  select(Q1:Q7_OTHER) %>% 
  mutate(id = row_number()) %>% 
  gather(Q7_Part_1:Q7_OTHER, key = 'key', value = 'Language') %>% 
  filter(!is.na(Language))

kor_lang2020 <- ks_2020 %>%
  filter(Q3 == 'South Korea') %>%
  select(Q1:Q7_OTHER) %>% 
  mutate(id = row_number()) %>% 
  gather(Q7_Part_1:Q7_OTHER, key = 'key', value = 'Language') %>% 
  filter(!is.na(Language))

# In the 2019 file the language options are read from the Q18 columns.
kor_lang2019 <- ks_2019 %>% 
  filter(Q3 == 'South Korea') %>%
  select(c(Q1, Q2, Q3, Q4, Q5), Q18_Part_1:Q18_Part_12) %>% 
  mutate(id = row_number()) %>% 
  gather(Q18_Part_1:Q18_Part_12, key = 'key', value = 'Language') %>% 
  filter(!is.na(Language))

# In the 2018 file the language options are read from the Q16 columns.
kor_lang2018 <- ks_2018 %>% 
  filter(Q3 == 'South Korea') %>%
  select(c(Q1, Q2, Q3, Q4, Q6), Q16_Part_1:Q16_Part_18) %>% 
  mutate(id = row_number()) %>% 
  gather(Q16_Part_1:Q16_Part_18, key = 'key', value = 'Language') %>% 
  filter(!is.na(Language))


# 2021: bar chart of how many South Korea respondents selected each language,
# most selected first. Labels show selections divided by the number of South
# Korea respondents (multiple answers allowed, so shares need not sum to 100%).
kor_lang_n21 <- kor_lang2021 %>%
  count(Language, name = 'Count', sort = TRUE) %>%
  mutate(rate = Count / sum(ks_2021$Q3 == 'South Korea', na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(Language, -Count), y = Count)) +
  geom_col(colour = 'black', fill = my_palette[5]) +
  geom_label(aes(label = label_percent(accuracy = 0.1)(rate))) +
  labs(title = 'Programming languages that used on a regular basis (South Korea)',
       subtitle = '(2021)',
       x = '') +
  my_theme

# 2020: bar chart of how many South Korea respondents selected each language,
# most selected first. Labels show selections divided by the number of South
# Korea respondents.
kor_lang_n20 <- kor_lang2020 %>%
  count(Language, name = 'Count', sort = TRUE) %>%
  mutate(rate = Count / sum(ks_2020$Q3 == 'South Korea', na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(Language, -Count), y = Count)) +
  geom_col(colour = 'black', fill = my_palette[4]) +
  geom_label(aes(label = label_percent(accuracy = 0.1)(rate))) +
  labs(subtitle = '(2020)', x = '') +
  my_theme

# 2019: bar chart of how many South Korea respondents selected each language,
# most selected first. Labels show selections divided by the number of South
# Korea respondents.
kor_lang_n19 <- kor_lang2019 %>%
  count(Language, name = 'Count', sort = TRUE) %>%
  mutate(rate = Count / sum(ks_2019$Q3 == 'South Korea', na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = reorder(Language, -Count), y = Count)) +
  geom_col(colour = 'black', fill = my_palette[3]) +
  geom_label(aes(label = label_percent(accuracy = 0.1)(rate))) +
  labs(subtitle = '(2019)', x = '') +
  my_theme

# 2018: bar chart of how many South Korea respondents selected each language,
# most selected first. Labels show selections divided by the number of South
# Korea respondents. Two long language names get a line break inserted.
kor_lang_n18 <- kor_lang2018 %>%
  count(Language, name = 'Count', sort = TRUE) %>%
  mutate(rate = Count / sum(ks_2018$Q3 == 'South Korea', na.rm = TRUE),
         Language = str_replace(Language, 'Javascript/Typescript', 'Javascript/\nTypescript'),
         Language = str_replace(Language, 'Visual Basic/VBA', 'Visual Basic/\nVBA')) %>%
  ggplot(mapping = aes(x = reorder(Language, -Count), y = Count)) +
  geom_col(colour = 'black', fill = my_palette[2]) +
  geom_label(aes(label = label_percent(accuracy = 0.1)(rate))) +
  labs(subtitle = '(2018)',
       caption = '\u00A9 Seung Park',
       x = '') +
  my_theme

# Patchwork layout: the four yearly charts stacked in a single column.
design <- 'AAAA\nBBBB\nCCCC\nDDDD'

wrap_plots(kor_lang_n21, kor_lang_n20, kor_lang_n19, kor_lang_n18,
           design = design)

2021년에는 응답자의 84.4%가 파이썬을 사용한다고 답변하였으며, 이는 2020년 대비 10% 이상 증가한 수치임
반면 R 사용자는 24.8%로 전년(23.7%) 대비 소폭 증가하였음

4.7 추천 언어

Q8 What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice

# Bar chart of the language South Korea respondents would recommend learning
# first (Q8, 2021), most recommended first. Missing answers are left out, so
# the labelled shares are relative to those who answered.
ks_2021 %>%
  filter(Q3 == 'South Korea', !is.na(Q8)) %>%
  count(Language = Q8, name = 'Count', sort = TRUE) %>%
  mutate(rate = Count / sum(Count)) %>%
  ggplot(mapping = aes(x = reorder(Language, -Count), y = Count)) +
  geom_col(colour = 'black', fill = my_palette[5]) +
  geom_label(aes(label = label_percent(accuracy = 0.1)(rate))) +
  labs(title = 'Programming languages that you recommend to learn first (South Korea)',
       subtitle = '(2021)',
       x = '') +
  my_theme

응답자의 81.5%가 파이썬을 추천하였음